/* This dofile is to match follow-up survey outcome and tvet admission data for grade 12 students with the current database */

// import dataset 
clear
import excel "$raw/student/followup survey grade 12/Follow-up_questions_on_final_decisions_20220828", clear firstrow

g indexnumber = _n

ren Studentcode studentid 
ren Schoolname schoolname 
ren Studentname name
ren CitizenshipIDNo citizenid
ren Didthestudentapplyforcol i_appcollege
ren stchoice i_appcollege_major1
ren ndchoice i_appcollege_major2
ren rdchoice i_appcollege_major3
ren ifothersPleasespecify i_appcollege_major_others

ren Wasthestudentadmittedto i_admcollege
ren Whichcollegemajorwasthe i_admcollege_major
ren S i_admcollege_major_others

ren Didthestudentapplyforte i_apptvet
ren U i_admtvet

ren Whatwasthestudentsfinal i_findecision
ren ifotherspleasespecify i_findecision_others
ren NameofthecollegeorTTII i_institute
ren IfOthersPleaseSpecify i_institute_others

rename *, lower   // three repeated var

replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"
destring citizenid, replace

tostring citizenid, replace format("%20.0f")

replace schoolname = trim(itrim(lower(schoolname)))
tab schoolname

// compare schoolname with our database and fix those inconsistent
replace schoolname = "baylling central school" if regexm(schoolname, "baylling")
replace schoolname = "chukha central school" if regexm(schoolname, "chukha")
replace schoolname = "damphu central school" if regexm(schoolname, "damphu|dampnu")
replace schoolname = "desi higher secondary school" if regexm(schoolname, "desi")
replace schoolname = "gaselo central school" if regexm(schoolname, "gaselo")
replace schoolname = "jigme sherubling central school" if regexm(schoolname, "jigme")
replace schoolname = "kelki higher secondary school" if regexm(schoolname, "kelki")
replace schoolname = "motithang higher secondary school" if regexm(schoolname, "motithang")
replace schoolname = "pelkhil higher secondary school" if regexm(schoolname, "pelkhil")
replace schoolname = "punakha central school" if regexm(schoolname, "phunakha")
replace schoolname = "shaba higher secondary school" if regexm(schoolname, "shaba")
replace schoolname = "tsenkharla central school" if regexm(schoolname, "tsenkharla")
replace schoolname = "wangbama central school" if regexm(schoolname, "wangbama")
replace schoolname = "yangchenphug higher secondary school" if regexm(schoolname, "yangchenphu|yangchenphug") 
replace schoolname = "yoezerling higher secondary school" if regexm(schoolname, "yoerzerling|yoezerling")
replace schoolname = "" if regexm(schoolname, "bachelors") 

replace name = "Jigme singye" if studentid == "Jigme singye" & name == "201.00380.11.0079"
replace studentid = "201.00380.11.0079" if name == "Jigme singye"  & citizenid == "11601002048"

replace name = "sonam Peldon" if studentid == "sonam Peldon" & name == "201.00398.11.0038"
replace studentid = "201.00398.11.0038" if studentid == "sonam Peldon" & citizenid == "11607001647"

replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses


gen dateofbirth1 = date(dateofbirth, "DMY")
format dateofbirth1 %td
g b_month = month(dateofbirth1)
g b_day = day(dateofbirth1)
g b_year = year(dateofbirth1)

g sex = 1 if gender == "Male"
replace sex = 2 if gender == "Female"
replace sex = 3 if gender == "Other "

drop dateofbirth dateofbirth1

g double dateofbirth = b_year*10^4 + b_month*10^2 + b_day
format dateofbirth %20.0f 
tostring dateofbirth, replace 

// correct for typos: this is done after finishing step 3, then manually check 

replace studentid = "204.00016.12.0002" if studentid == "204.00016.12.002" & name == "chatrabhadurchhetri"
replace studentid = "201.00059.11.0034" if studentid == "201.00059.11 0034" & name == "kuensumkuendreldrelma"
replace studentid = "201.00045.11.0055" if studentid == "201.00045.11.0054" & name == "phuntshozangmo"
replace studentid = "201.00029.11.0109" if studentid == "201.00029.11 0109" & name == "sangitarai"
replace studentid = "201.00403.11.0112" if studentid == "201.00403.11.00112"
replace studentid = "201.00452.11.0317" if studentid == "201.00049.11.0026" & sex == 1 & dateofbirth == "20030314" & name == "sonamyoezer"

replace studentid = "201.00315.11.0100" if studentid == "201.00315.11.01000" & name == "dechenpenjor"
replace citizenid = "10705005142" if studentid == "201.00269.11.0138" & citizenid == "10705005141"
replace studentid = "201.00010.11.3615" if studentid == "201.00010.113615" & name == "snehasubba"
replace studentid = "201.00154.11.0028" if studentid == "201.00154.110028"
replace studentid = "202.00017.11.0127" if studentid == "202.00017.0127"
replace studentid = "201.00296.11.0181" if studentid == "201.00396.11.0181" & name == "nirutalepcha"

replace studentid = "201.00296.11.0127" if studentid == "201.00396.11.01127" & name == "pemadechen"
replace studentid = "201.00288.11.0049" if studentid == "23.2.2002" & name == "sarasatidarjee"
replace studentid = "201.00312.11.0205" if studentid == "201.00312.11.9205" & name == "tandintshering"

replace citizenid = "11805000979" if studentid == "201.00436.11.0078" & citizenid == "11805000797"
replace studentid = "201.00524.11.0106" if studentid == "201.00436.11.0078" & citizenid == "11805002000"

replace studentid = "201.00301.11.0264" if studentid == "201..00301.11.0264" & name == "tsheringchodenghalley"

replace studentid = "201.00286.11.0326" if studentid == "201.00386.11.0326" & name == "namgayrinchen"
replace studentid = "201.00306.11.0065" if name == "ngawangnorbu" & schoolname == "pelkhil higher secondary school" & dateofbirth == "20020924"

replace citizenid = "11008000387" if studentid == "201.00457.11.0051" & name == "tashilhamo" & dateofbirth == "20040722"

// remove duplicated observations 
sort schoolname name studentid sex 
g dup = 0
forval i=1/3 {
replace dup = 1 if schoolname == schoolname[_n-`i'] & name == name[_n-`i'] & sex == sex[_n-`i'] & studentid==studentid[_n-`i']
}

// save the list of duplicated observations and send to Cheku
preserve 
duplicates tag schoolname name sex studentid, g(duplicatedobservations)
tab duplicatedobservations
keep if duplicatedobservations ~= 0
keep start end enumeratorid schoolname name studentid citizenid gender phonenumber
export excel using "$temp/File1_grade12_degree_data_duplicatedobservations.xlsx", replace firstrow(variables)
restore 

keep if dup == 0

keep indexnumber name studentid citizenid b_* dateofbirth sex schoolname i_appcollege i_admcollege i_appcollege_major1 i_appcollege_major2 i_appcollege_major3 i_appcollege_major_others i_admcollege i_admcollege_major i_admcollege_major_others i_apptvet i_admtvet i_findecision i_findecision_others i_institute i_institute_others
foreach variable in schoolname name studentid citizenid dateofbirth sex b_month b_day b_year{
	rename `variable' `variable'_s 
}

save "$temp/tvetchoice.dta", replace 


*********************************************************************************
*********     clean tvet admission data
*********************************************************************************
clear
import excel "/Users/ye/Downloads/TVET Trainee applicants.xlsx", clear firstrow

g tvetindexnumber = _n

ren app_name name

ren cid_no citizenid
ren dob dateofbirth
ren email_id email
ren contact_no phone
ren present_dzongkhag_name dzongkhag
g i_admtvet = "Yes" 

rename *, lower   // three repeated var

replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"
destring citizenid, replace

tostring citizenid, replace format("%20.0f")

replace institute_name = trim(itrim(lower(institute_name)))
tab institute_name

replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses

gen dateofbirth1 = date(dateofbirth, "MDY")
format dateofbirth1 %td
g b_month = month(dateofbirth1)
g b_day = day(dateofbirth1)
g b_year = year(dateofbirth1)

g sex = 1 if gender == "M"
replace sex = 2 if gender == "F"
replace sex = 3 if gender == "Other"

drop dateofbirth dateofbirth1

g double dateofbirth = b_year*10^4 + b_month*10^2 + b_day
format dateofbirth %20.0f 
tostring dateofbirth, replace 

// remove duplicated observations 
sort institute_name name citizenid sex 
g dup = 0
forval i=1/3 {
replace dup = 1 if institute_name == institute_name[_n-`i'] & name == name[_n-`i'] & sex == sex[_n-`i'] & citizenid==citizenid[_n-`i']
}

keep if dup == 0


foreach variable in name citizenid dateofbirth sex b_month b_day b_year{
	rename `variable' `variable'_t 
}

save "$temp/tvetadm.dta", replace 


********************************************************************************
*
*			MERGE WITH GRADE 12 SURVEY DATA 
*
********************************************************************************

// STEP 1: PERFECT MATCHING ON STUDENT ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria studentid  

* open the tvet choice data 
use "$temp/tvetchoice.dta", clear 
rename studentid_s studentid 

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename studentid studentid_s
save $temp/tvet_step1d, replace


* open the survey data 
use "$temp/tvet_analysis.dta", clear
replace name = subinstr(name," ","",.)  
replace sex = 2 if sex == 0 
format studentid_b %20s

generate str citizenid_st = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = citizenid_st
drop citizenid_st
describe citizenid

save "$temp/data12.dta", replace 


use "$temp/data12.dta", clear
rename studentid_b studentid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
rename studentid studentid_b 
save $temp/data12_step1d, replace

* merging two unique data together using fully matched student ID & check fuzzy name 
use $temp/tvet_step1u, clear
merge 1:1 $criteria using $temp/data12_step1u
matchit name_s name, g(namescore)
matchit dateofbirth_s dateofbirth, g(bscore)
sum namescore if _merge==3

* correct if names are not matched, leave for the next round of matching
replace _merge = . if (namescore <= 0.5 & bscore < 0.8) & _merge == 3
replace _merge = 3 if studentid == "201.00524.11.0065" // missing date of birth for follow up survey and matched otherwise 
replace _merge = . if studentid == "201.00323.11.0147"

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
unique indexnumber
save $temp/tvet_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step1d
save $temp/tvet_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_b schoolname_b 
merge 1:1 responseid_b schoolname_b using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step1d
save $temp/data12_step2b, replace 
restore 


// STEP 2: PERFECT MATCHING ON CITIZEN ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria citizenid 

* open the tvet choice data 
use $temp/tvet_step2b.dta, clear 
rename citizenid_s citizenid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_s
save $temp/tvet_step2d, replace


* open the survey data 
use $temp/data12_step2b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step2d, replace


* merging two unique data together using fully matched citizenship ID & check fuzzy name 
use $temp/tvet_step2u
merge 1:1 $criteria using $temp/data12_step2u
matchit name_s name, g(namescore)
matchit dateofbirth_s dateofbirth, g(bscore)

sum namescore if _merge==3

replace _merge = . if namescore < 0.7 & bscore < 0.9 & _merge == 3
replace _merge = 3 if name_s == "pematshoki" & studentid_s == "201.00382.11.0282"
replace _merge = 3 if studentid_s == "201.00323.11.0147" & citizenid == "11607003007" 

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
unique indexnumber 
save $temp/tvet_step2m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step2d
save $temp/tvet_step3b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_b schoolname_b 
merge 1:1 responseid_b schoolname_b using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step2d
save $temp/data12_step3b, replace 
restore  



// STEP 3: PERFECT MATCHING ON SEX + DATE OF BIRTH
********************************************************************************
gl criteria sex dateofbirth    

* open the tvet choice data 
use $temp/tvet_step3b.dta, clear 

rename dateofbirth_s dateofbirth 
rename sex_s sex

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step3u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap ren dateofbirth dateofbirth_s
cap ren sex sex_s
save $temp/tvet_step3d, replace


* open the survey data 
use $temp/data12_step3b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step3u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step3d, replace


* merging two unique data together 
use $temp/tvet_step3u, clear
merge 1:1 $criteria using $temp/data12_step3u
matchit name_s name, g(namescore)
matchit studentid_s studentid_b, g(idscore)
matchit citizenid_s citizenid, g(cidscore)

br name name_s *score if  _merge==3
sort name name_s

* correct if names are not matched, leave for the next round of matching
replace _merge = . if citizenid_s == "10204001407" & studentid_s == "201.00031.11.1234"

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid
unique indexnumber  
save $temp/tvet_step3m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step3d
save $temp/tvet_step4b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_b schoolname_b 
merge 1:1 responseid_b schoolname_b using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step3d
save $temp/data12_step4b, replace 
restore  

// Match from tvet admission data 
// STEP 4: PERFECT MATCHING ON citizenid
********************************************************************************
gl criteria citizenid  

* open the tvet choice data 
use $temp/tvetadm.dta, clear 

rename citizenid_t citizenid 

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step4u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap ren citizenid citizenid_t
save $temp/tvet_step4d, replace


* open the survey data 
use $temp/data12_step4b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step4u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step4d, replace


* merging two unique data together 
use $temp/tvet_step4u, clear
merge 1:1 $criteria using $temp/data12_step4u   // 21 matched
matchit name_t name, g(namescore)
matchit dateofbirth_t dateofbirth, g(dobscore)

br name name_t *score if  _merge==3
sort name name_t

* correct if names are not matched, leave for the next round of matching
//replace _merge = . if citizenid_s == "10204001407" & studentid_s == "201.00031.11.1234"

* keep matched data 
preserve 
keep if _merge==3 
keep tvetindexnumber uniqueid
unique tvetindexnumber  
save $temp/tvet_step4m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep tvetindexnumber 
merge 1:1 tvetindexnumber using $temp/tvetadm.dta, keep(matched) nogen 
append using $temp/tvet_step4d
save $temp/tvet_step5b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_b schoolname_b 
merge 1:1 responseid_b schoolname_b using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step4d
save $temp/data12_step5b, replace 
restore  



// combine data together 
clear 
append using $temp/tvet_step1m
g tvet_match_criteria = "perfect(studentID) + fuzzy(name/dob)"

append using $temp/tvet_step2m
replace tvet_match_criteria = "perfect(CID) + fuzzy(name/dob)" if missing(tvet_match_criteria)

append using $temp/tvet_step3m
replace tvet_match_criteria = "perfect(dob&sex) + fuzzy(studentID|CID|name)" if missing(tvet_match_criteria)
count

merge 1:1 indexnumber using "$temp/tvetchoice.dta", keep(matched) keepusing(i_*) nogen
save $temp/tvetchoicemerged, replace   //4022 matched


clear 
append using $temp/tvet_step4m
g tvet_match_criteria = "perfect(CID) + fuzzy(name/dob)"
g adm_data = 1
lab var adm_data "1/0 tvet admission data available"
count

merge 1:1 tvetindexnumber using "$temp/tvetadm.dta", keep(matched) keepusing(*) nogen
save $temp/tvetadmmerged, replace      //21 matched
append using $temp/tvetchoicemerged
replace adm_data = 0 if missing(adm_data)

tab tvet_match_criteria

merge 1:1 uniqueid using "$temp/grade12_analysis" 
g tvet_data = 1 if _merge == 3 
replace tvet_data = 0 if _merge == 2
drop _merge 
lab var tvet_data "1/0 tvet data available"

g tvet_institute = 1 if i_admtvet == "Yes" 
replace tvet_institute = 0 if i_admtvet ~= "Yes" & ~missing(i_appcollege) // ?
lab var tvet_institute "1/0 enrolled in tvet institute after grade 12"

save "$clean/grade12_analysis.dta", replace 
